8.1 RM-GF correlations tests


In [ ]:
%run "../Functions/8. RM-GF correlations.ipynb"

In [ ]:
# Work on a copy so the cells below cannot modify the source DataFrame
# (presumably defined by the notebook executed through %run above -- confirm).
allData = allDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.copy()

Correlation between max chapter and answers

method 1: correlation matrix

index: question groups

columns: RedMetrics parameters


In [ ]:
#def getScoresOnQuestionsFromAllData(allData, Qs):

method 2

use max chapter or <= max chapter?


In [ ]:
# Empty result table: one row per posttest scientific question, one column per
# chapter index 0..14. It is filled in by the "method 2" loop further down.
# NOTE(review): 15 is presumably the number of chapters/checkpoints -- confirm.
correctPerMaxChapter = pd.DataFrame(index = posttestScientificQuestions, columns = range(15))

In [ ]:
# Exploratory check of the boolean-mask selection used below:
# labels of the allData columns whose 'maxChapter' value is exactly 10.
allData.loc[:, allData.loc['maxChapter', :] == 10].columns

In [ ]:
# For the players whose furthest chapter ('maxChapter') is exactly N, what is the
# rate of correct answers to posttest question Q?
# Fills `correctPerMaxChapter` (rate, in percent, truncated to an integer) and
# `maxCheckpointsDF` (number of players per max chapter), then draws one heatmap
# for each table.
# NOTE(review): the percentage reading assumes each answer score is 0 or 1 -- confirm.
maxCheckpointsDF = pd.DataFrame(index = ['maxCh'], columns=range(15))

# Looked up once instead of once per question.
maxChapters = allData.loc['maxChapter', :]

# dropna(): a missing maxChapter never satisfies `== chapter`; the resulting
# empty group would make the percentage computation divide by zero.
for chapter in maxChapters.dropna().unique():
    reachedExactlyChapter = maxChapters == chapter
    eltsCount = int(reachedExactlyChapter.sum())
    maxCheckpointsDF.loc['maxCh', chapter] = eltsCount
    for q in posttestScientificQuestions:
        interestingElts = allData.loc[q, reachedExactlyChapter]
        scoreSum = interestingElts.sum()
        correctPerMaxChapter.loc[q, chapter] = int(scoreSum * 100 / eltsCount)

# -1 marks the (question, chapter) cells for which no player was observed.
# Both tables are created without data (object dtype); casting to int hands the
# heatmaps a numeric array that matches the 'd' annotation format.
correctPerMaxChapterNotNan = correctPerMaxChapter.fillna(-1).astype(int)

_fig1 = plt.figure(figsize=(20,20))
_ax1 = plt.subplot(111)
# The title names the table actually plotted here (it used to read "maxCheckpointsDF").
_ax1.set_title("correctPerMaxChapterNotNan")
sns.heatmap(
    correctPerMaxChapterNotNan,
    ax=_ax1,
    cmap=plt.cm.jet,
    square=True,
    annot=True,
    fmt='d',
)


# Chapters that are nobody's max chapter count as 0 players.
maxCheckpointsDFNotNan = maxCheckpointsDF.fillna(0).astype(int)

_fig2 = plt.figure(figsize=(14,2))
_ax2 = plt.subplot(111)
_ax2.set_title("maxCheckpointsDF")
sns.heatmap(
    maxCheckpointsDFNotNan,
    ax=_ax2,
    cmap=plt.cm.jet,
    square=True,
    annot=True,
    fmt='d',
)

In [ ]:
# Pearson correlation, across the columns of allData, between the scores of each
# posttest scientific question and the 'maxChapter' row.
corrChapterScQDF = pd.DataFrame(index=posttestScientificQuestions, columns=['corr'])

for q in posttestScientificQuestions:
    questionScores = allData.loc[q, :].values
    maxChapterValues = allData.loc['maxChapter', :].values
    # Off-diagonal entry of the 2x2 correlation matrix.
    corrChapterScQDF.loc[q, 'corr'] = np.corrcoef(questionScores, maxChapterValues)[1, 0]

# -2 lies outside the valid range [-1, 1]: it flags undefined (NaN) coefficients.
corrChapterScQDFNotNan = corrChapterScQDF.fillna(-2)

_fig1, _ax1 = plt.subplots(figsize=(14, 10))
_ax1.set_title("corrChapterScQDFNotNan")
sns.heatmap(
    corrChapterScQDFNotNan,
    ax=_ax1,
    cmap=plt.cm.jet,
    square=True,
    annot=True,
    fmt='.2f',
    vmin=-1,
    vmax=1,
)

In [ ]:

Clustering answers to find underlying correlation with RedMetrics data


In [ ]:
from sklearn.cluster import KMeans
# Import KernelDensity from the public package path: the private
# `sklearn.neighbors.kde` module was deprecated in scikit-learn 0.22 and removed
# in 0.24, while `sklearn.neighbors` exposes the class in old and new versions.
from sklearn.neighbors import KernelDensity

In [ ]:
# Toy 1-D data set (one feature per sample) used to try out KMeans.
X = np.array([0.9, 1, 1.1, 4, 4.1, 4.2, 5]).reshape(-1, 1)

kmeans = KMeans(n_clusters=2, random_state=0)
kmeans.fit(X)

# Sum of squared distances of the samples to their closest cluster centre.
kmeans.inertia_

In [ ]:
# Cluster index assigned to each sample of X by the fit above.
kmeans.labels_

In [ ]:
# Coordinates of the fitted cluster centres.
kmeans.cluster_centers_

In [ ]:
# Assign two new 1-D samples to the closest fitted cluster centre.
kmeans.predict([[3], [4]])

In [ ]:
# NOTE(review): not used by any cell visible here -- presumably the maximum KMeans
# inertia for which a clustering is considered acceptable; confirm or remove.
inertiaThreshold = 1

In [ ]:
# Build the row label of each scientific question by prefixing it with
# answerTemporalities[1] (presumably the posttest prefix -- confirm).
# NOTE(review): the .loc lookup in the loop body is evaluated but its result is
# discarded (an expression inside a loop is not displayed). The loop's only lasting
# effect is that `posttestQuestion` keeps the label of the LAST question, which
# the following cell relies on.
for question in scientificQuestions:
    posttestQuestion = answerTemporalities[1] + " " + question
    #deltaQuestion = delta + " " + question
    allDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.loc[posttestQuestion, :]

In [ ]:
# Cluster the answers given to `posttestQuestion` (set by the loop in the previous
# cell) into `clusterCount` groups; each answer is a one-feature sample.
clusterCount = 3
X = [
    [answer]
    for answer in allDataPlaytestPhase1PretestPosttestUniqueProfilesVolunteers.loc[posttestQuestion, :].values
]

kmeans = KMeans(n_clusters=clusterCount, random_state=0)
kmeans.fit(X)

# Fewer distinct labels than requested means some clusters ended up empty.
if len(set(kmeans.labels_)) != clusterCount:
    print("incorrect number of clusters")

# Sum of squared distances of the samples to their closest cluster centre.
kmeans.inertia_

In [ ]:

Clustering using KernelDensity


In [ ]:
# Toy 1-D data set: two groups of points, stored as a single-feature column.
X = np.array([-1, -2, -3, 1, 2, 3]).reshape(-1, 1)

kde = KernelDensity(kernel='gaussian', bandwidth=0.2)
kde.fit(X)

# Log-likelihood of each training sample under the fitted density.
kde.score_samples(X)

In [ ]:
# Same density estimate as the previous cell, but starting from a flat 1-D array:
# reshape(-1, 1) turns it into the single-column 2-D layout passed to
# fit() and score_samples().
X = np.array([-1, -2, -3, 1, 2, 3])
kde = KernelDensity(kernel='gaussian', bandwidth=0.2).fit(X.reshape(-1, 1))
# NOTE(review): this result is computed but not displayed -- only the last
# expression of a cell is shown.
kde.score_samples(X.reshape(-1, 1))
# Displayed output: the reshaped column array.
X.reshape(-1, 1)